########################################################
# Project:    Commission Communication
# Task:       Determine number of topics for STM in corpus
#             of Commission PRs
# Author:     Christian Rauh (04.02.2021)
########################################################


# Packages ####
library(tidyverse)
library(quanteda)
library(stm)


# Data prep ####

# Load and filter corpus
com <- read_rds("./Corpora/EC-PressReleases_1985-2020_clean.RDS") %>% 
  filter(text != "") %>% 
  filter(ntokens >= 5) %>% 
  mutate(sentlength = ntokens/nsentences) %>% 
  filter(sentlength <= 100) %>% 
  select(-sentlength)

# # Create DFM with quanteda
# qcorp <- corpus(com$text, docvars = com[ ,c("ipnum", "year")])
# # stemmed, lowered, no stopwords
# # only words that appear in at least 1% and max 99% of docs 
# qdfm <- dfm(qcorp, tolower = T,
#             remove = stopwords("english"),
#             remove_punct = T, 
#             remove_numbers = T,
#             remove_symbols = T,
#             verbose = T) %>% 
#   dfm_trim(min_docfreq = 0.01, max_docfreq = 0.99, docfreq_type = "prop")
# 
# # Inspection
# length(featnames(qdfm))
# freqs <- convert(qdfm, to = "data.frame") %>% 
#   select(-1) %>% 
#   colSums() %>% 
#   as.data.frame()
# 
# docterms <- convert(qdfm, to = "data.frame") %>% 
#   select(-1) %>% 
#   rowSums() %>% 
#   as.data.frame()
# 
# # Convert to tm typed object
# sdfm <- convert(qdfm, to = "stm")
# 
# 
# # Estimate topic models ####
# # for seq(10, 100, 10) it takes 9.3 hours
# # See also: https://juliasilge.com/blog/evaluating-stm/
# 
# start <- Sys.time()
# k <- seq(10, 100, 10) # Number of topics to evaluate
# kresult <- searchK(sdfm$documents, sdfm$vocab, k, verbose = T, proportion = .1)
# Sys.time() - start
# 
# write_rds(kresult, "./Data/SearchK_EC_stemmed_1percentPrune.rds")
# # plot(kresult)

kresult <- read_rds("./Data/SearchK_EC_stemmed_1percentPrune.rds")

result <- data.frame(K = unlist(kresult$results$K),
                     semcoh = unlist(kresult$results$semcoh),
                     exclus = unlist(kresult$results$exclus),
                     heldout = unlist(kresult$results$heldout),
                     residual = unlist(kresult$results$residual),
                     # bound = unlist(kresult$results$bound),
                     lbound = unlist(kresult$results$lbound),
                     em.its = unlist(kresult$results$em.its)) %>% 
  pivot_longer(cols = 2:7)
  
ggplot(result, aes(x=K, y = value))+
  geom_line()+
  geom_vline(xintercept =20, linetype = "solid")+
  geom_vline(xintercept = c(30,40), linetype = "dashed")+
  facet_wrap(.~name, scales = "free_y")+
  scale_x_continuous(breaks = seq(10, 100, 10))+
  theme_bw()

ggsave("./Plots/FigureA3_SearchK_DIAG_EC_stemmed_1percentPrune.png", width = 20, height = 12, units = "cm")

